library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)

Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame
datMeta = datMeta %>% mutate(ID = title)

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('entrezgene'=rownames(.) %>% as.numeric) %>% 
          dplyr::rename('padj' = adj.P.Val, 'log2FoldChange' = logFC) %>%
          left_join(datGenes %>% dplyr::select(entrezgene, ensembl_gene_id) %>% 
                    dplyr::rename('ID' = ensembl_gene_id), by = 'entrezgene') %>%
          left_join(GO_neuronal, by='ID') %>%
          mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
          mutate(significant=padj<0.05 & !is.na(padj))


rm(GO_annotations)


Mean Level of Expression


All samples together


  • There seems to be a small group of genes with lower mean expression than most genes

  • There are some samples with lower level of expression than most samples

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))

p1 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') + ggtitle('Mean Expression distribution by Gene') +
     theme_minimal()

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))

p2 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') +
     theme_minimal() + ggtitle('Mean expression distribution by Sample')

grid.arrange(p1, p2, nrow=1)

rm(p1, p2, plot_data)

Grouping samples by Phenotype


The differences in level of expression between Phenotype information are not statistically significant

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')

p1 = plot_data %>% ggplot(aes(Ethnicity, Mean, fill = Ethnicity)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     xlab('Batch') + ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')

p2 = plot_data %>% ggplot(aes(Sex, Mean, fill = Sex)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     xlab('Gender') + ylab('') + theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1,p2, nrow = 1)

rm(p1,p2)

Grouping genes by Neuronal tag and samples by Diagnosis


  • The two groups of genes seem to be partially characterised by genes with Neuronal function

  • In general, the ASD group has a lower mean than the control group (opposite to Gandal and Gupta’s results)

  • Only the differences in mean expression between Neuronal and non-neuronal genes are statistically significant

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>% 
            left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))

p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by gene')

p3 = plot_data %>% ggplot(aes(Neuronal, Mean, fill = Neuronal)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() + 
                   ylab('Mean Expression') + theme(legend.position = 'none')

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')

p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by Sample')

p4 = plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() +
                   ylab('Mean Expression') + theme(legend.position = 'none')


grid.arrange(p1, p2, p3, p4, nrow=2)

rm(GO_annotations, plot_data, p1, p2, p3, p4)


Grouping genes and samples by Diagnosis

In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene

plot_data = data.frame('ID'=rownames(datExpr),
                       'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
                       'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
                       mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
                       mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))

plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) + 
              geom_abline(color = 'gray') + scale_color_viridis(direction = -1) + 
              ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()

summary(plot_data$std_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -13.3392  -0.2758   0.0368   0.0000   0.3367  11.8097
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))

There are 372 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes. Gene ENSG00000205670 has the largest difference in mean expression between ASD and CTL


  • There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups

  • Samples with autism tend to have lower values than the control group (as we had already seen above)

plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal())

plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal() +
              ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)




Visualisations


Samples


PCA


Samples seems to separate relatively well by Diagnosis

ASD samples seem to be more evenly spread out than the Control samples

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by='ID') %>% dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              theme_minimal() + ggtitle('PCA of Samples')

rm(pca, plot_data)


MDS


Looks exactly the same as the PCA visualisation, just inverting the both axes

d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)

plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>% left_join(datMeta, by='ID') %>% 
            dplyr::select('C1','C2','Diagnosis') %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point(alpha = 0.8) + theme_minimal() + ggtitle('MDS')

rm(d, fit, plot_data)


t-SNE


T-SNE seems to be struggling to separate the samples by Diagnosis

Using a perplexity of 2 the ASD samples seem to gather in the center and Controls in the periphery

perplexities = c(1,2,5,10)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>% 
              left_join(datMeta, by='ID') %>%
              dplyr::select('C1','C2','Diagnosis') %>%
              mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
            ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, perplexities, tsne, i)


Genes


PCA


  • The First Principal Component explains over 97% of the total variance

  • There’s a really strong correlation between the mean expression of a gene and the 1st principal component

  • The magnitude of the second principal component seems to be related to the level of expression of the genes (this didn’t happen in the other datasets)

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


t-SNE


Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA

perplexities = c(1,2,5,10,50,100)
ps = list()

for(i in 1:length(perplexities)){
  tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
            scale_color_viridis() + ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(perplexities, ps, tsne, i)




Differential Expression Analysis


Only 2 genes (~0.013% vs ~28% in Gandal’s dataset) are significant using a threshold of 0.05 for the adjusted p-value and a without a log Fold Change threshold (keeping the null hypothesis \(H_0: LFC=0\))

table(DE_info$padj<0.05, useNA='ifany')
## 
## FALSE  TRUE 
## 15390     2
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) + 
    scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)

rm(p)
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID')

plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange))) + 
              geom_point(alpha = 0.3, color='#0099cc') + geom_smooth(method='lm', color = 'gray') + 
              theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
              xlab('Mean Expression') + ylab('LFC Magnitude') + 
              ggtitle('Log fold change by level of expression')

List of DE genes

# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>% 
             rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')

top_genes = plot_data %>% left_join(gene_names, by='ID') %>% filter(padj<0.05) %>% arrange(-abs(log2FoldChange))

kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal))
ID gene_name log2FoldChange padj Neuronal
ENSG00000133316 WDR74 -2.5677696 0.0000011 0
ENSG00000101412 E2F1 -0.4326021 0.0143666 0
rm(top_genes)



Effects of modifying the log fold change threshold


No point in doing this having only 2 DE genes




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.28                  biomaRt_2.40.5             
##  [3] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [7] matrixStats_0.56.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.3              S4Vectors_0.22.1           
## [13] BiocGenerics_0.30.0         ClusterR_1.2.1             
## [15] gtools_3.8.2                expss_0.10.2               
## [17] Rtsne_0.15                  ggpubr_0.2.5               
## [19] magrittr_1.5                GGally_1.5.0               
## [21] gridExtra_2.3               viridis_0.5.1              
## [23] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [25] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [27] glue_1.4.1                  reshape2_1.4.4             
## [29] forcats_0.5.0               stringr_1.4.0              
## [31] dplyr_1.0.0                 purrr_0.3.4                
## [33] readr_1.3.1                 tidyr_1.1.0                
## [35] tibble_3.0.1                ggplot2_3.3.2              
## [37] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] gmp_0.5-13.6           crosstalk_1.1.0.1      digest_0.6.25         
##  [10] htmltools_0.4.0        fansi_0.4.1            checkmate_2.0.0       
##  [13] memoise_1.1.0          cluster_2.1.0          annotate_1.62.0       
##  [16] modelr_0.1.6           prettyunits_1.1.1      jpeg_0.1-8.1          
##  [19] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [22] haven_2.2.0            xfun_0.12              crayon_1.3.4          
##  [25] RCurl_1.98-1.2         jsonlite_1.7.0         genefilter_1.66.0     
##  [28] survival_3.1-12        gtable_0.3.0           zlibbioc_1.30.0       
##  [31] XVector_0.24.0         scales_1.1.1           DBI_1.1.0             
##  [34] miniUI_0.1.1.1         Rcpp_1.0.4.6           xtable_1.8-4          
##  [37] progress_1.2.2         htmlTable_1.13.3       foreign_0.8-76        
##  [40] bit_1.1-15.2           Formula_1.2-3          htmlwidgets_1.5.1     
##  [43] httr_1.4.1             acepack_1.4.1          ellipsis_0.3.1        
##  [46] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [49] farver_2.0.3           nnet_7.3-14            dbplyr_1.4.2          
##  [52] locfit_1.5-9.4         later_1.0.0            tidyselect_1.1.0      
##  [55] labeling_0.3           rlang_0.4.6            AnnotationDbi_1.46.1  
##  [58] munsell_0.5.0          cellranger_1.1.0       tools_3.6.3           
##  [61] cli_2.0.2              generics_0.0.2         RSQLite_2.2.0         
##  [64] broom_0.5.5            fastmap_1.0.1          evaluate_0.14         
##  [67] yaml_2.2.1             bit64_0.9-7            fs_1.4.0              
##  [70] nlme_3.1-147           mime_0.9               ggExtra_0.9           
##  [73] xml2_1.2.5             compiler_3.6.3         rstudioapi_0.11       
##  [76] curl_4.3               png_0.1-7              ggsignif_0.6.0        
##  [79] reprex_0.3.0           geneplotter_1.62.0     stringi_1.4.6         
##  [82] highr_0.8              lattice_0.20-41        Matrix_1.2-18         
##  [85] vctrs_0.3.1            pillar_1.4.4           lifecycle_0.2.0       
##  [88] data.table_1.12.8      bitops_1.0-6           httpuv_1.5.2          
##  [91] R6_2.4.1               latticeExtra_0.6-29    promises_1.1.0        
##  [94] assertthat_0.2.1       withr_2.2.0            GenomeInfoDbData_1.2.1
##  [97] mgcv_1.8-31            hms_0.5.3              grid_3.6.3            
## [100] rpart_4.1-15           rmarkdown_2.1          shiny_1.4.0.2         
## [103] lubridate_1.7.4        base64enc_0.1-3